
clear all 
set more off
global dir "C:\Users\yuanzi\Quant Space&Trade Dropbox\Yuan Zi (资源)\1-Projects\0-RRed\1-Shipment cargo\Ais project"
cd "$dir\Get ship_port_to_port.dta\raw Data"


use ais_container_sorted_yz,clear

**  (1) Add date
* Build a numeric %tc datetime from the string time_stamp_original.
* The string is split on "-", " ", "T" and "." so that pieces 1-4 are
* year, month, day and clock time; piece 5 (whatever follows the ".",
* presumably fractional seconds -- TODO confirm) is discarded.
gen timestamp=time_stamp_original
split timestamp, p("-" " " "T" ".")
*gen check=timestamp  //checked: one ship has all entries missing; that obs is dropped below
drop timestamp5 timestamp
label var timestamp1 "year"
label var timestamp2 "month"
label var timestamp3 "day"
label var timestamp4 "time"
* Split the clock time into hour / minute / second (timestamp41-43).
split timestamp4, p(":")

* Convert the pieces to numeric. timestamp4 still contains ":" so destring
* is expected to leave it as a string; it is dropped with the rest below.
destring timestamp*,replace
drop if timestamp1==.  //the entry with an all-missing timestamp (see the commented-out check above)
* %tc values are milliseconds since 01jan1960 and exceed float precision:
* the variable MUST be stored as double, otherwise timestamps are silently
* rounded and later date differences (travel hours) are wrong.
gen double date=mdyhms(timestamp2,timestamp3,timestamp1,timestamp41,timestamp42,timestamp43) 
label var date "numerically saved date var"
format date %tc
drop timestamp*
save tempbackup,replace


** (2) Drop 1. Arrivals without departure port
      * 2. Arrival port not equal to next departure port (5 cases, 10 obs)
      * 3. Arrival of the next port equals departure port
use tempbackup, clear
gsort ship_id time move_type
cap drop trip
* trip = id of a voyage (a departure and the arrival that follows it).
* The id is the observation number of the departure. It is stored as long
* because the default float cannot represent every integer above
* 16,777,216, which would make ids collide in a large dataset.
gen long trip=_n if move_type=="departure"
by ship_id: replace trip=trip[_n-1] if  trip==. &  move_type[_n-1]=="departure"
sum ship_id if trip==. //check missings. Those are cases that start with arrivals -> zero missings
drop if trip==.  //zero missings

gsort ship_id time move_type
* trip2 = id of a port stay (an arrival and the departure that follows it).
* It takes the trip id of the departure that ends the stay.
gen long trip2=.
order trip*
bys ship_id: gen first=_n if move_type=="departure"
bys ship_id: replace trip2=trip[_n+1] if move_type=="arrival"
bys ship_id: replace trip2=trip2[_n-1] if move_type=="departure" & first!=1 & trip2==.

* Flag port stays whose arrival port differs from the departure port,
* then propagate the flag to the other observation of the same stay.
bys ship_id trip2: gen check=1 if port_id!=port_id[_n-1] & trip2==trip2[_n-1] & trip2!=.
bys ship_id trip2: replace check=1 if check[_n+1]==1
drop if check==1  //dropped 5 trip2, 10 obs
drop check
		
* Flag voyages whose departure port equals the arrival port.
gsort ship_id trip time move_type		
by ship_id trip: gen check=1 if port_id==port_id[_n-1]
by ship_id trip: replace check=1 if port_id==port_id[_n+1] 
sum ship_id if check==1    
drop if check==1 //drop obs where departure port = arrival port //111,710 dropped
drop check

* Attach port attributes; keep only observations whose port is in the port list.
merge m:1 port_id using port_list_full
keep if _merge==3
drop _merge
rename port_lon lon 
rename port_lat lat
save tempdata,replace


** (3) Re-create trip and trip2
* Observations were dropped in step (2), so the voyage id (trip) and the
* port-stay id (trip2) are rebuilt on the remaining data.
use tempdata, clear
gsort ship_id time move_type
cap drop trip
* Ids come from observation numbers; store them as long because the default
* float cannot represent every integer above 16,777,216 and ids would collide.
gen long trip=_n if move_type=="departure"
by ship_id: replace trip=trip[_n-1] if  trip==. &  move_type[_n-1]=="departure"

gsort ship_id time move_type
cap drop trip2 
cap drop first
gen long trip2=.
order trip*
bys ship_id: gen first=_n if move_type=="departure"
* An arrival takes the trip id of the departure that follows it ...
bys ship_id: replace trip2=trip[_n+1] if move_type=="arrival"
* ... and that departure (unless it is the ship's first record) inherits it.
bys ship_id: replace trip2=trip2[_n-1] if move_type=="departure" & first!=1 & trip2==.
* No filename given: this overwrites the file loaded above (tempdata).
save, replace
		


** (3b) Check irregular arrival/departure (AD) order at different timestamps
* Within a ship, consecutive records should alternate A, D, A, D ...
*   - two departures in a row: keep the LAST one as the real departure
*   - two arrivals in a row:   keep the FIRST one as the real arrival
use tempdata, clear
gsort ship_id time move_type
bysort ship_id: generate byte irregular = ///
	(move_type=="departure" & move_type[_n+1]=="departure") | ///
	(move_type=="arrival"   & move_type[_n-1]=="arrival")
* count if irregular   // 0 cases with irregular AD order
drop if irregular
drop irregular
save tempdata, replace   // 0 cases dropped
	
	
** (4) Check that anchorage records come in pairs
* A ship that arrives at an anchorage port should also depart from an
* anchorage port. Within each port stay (ship_id, trip2) a departure whose
* port_type differs from the preceding record is flagged, and the flag is
* copied to the arrival that precedes a flagged record.
use tempdata, clear
capture drop check
gsort ship_id trip2 time move_type
by ship_id trip2: generate check = 1 if move_type=="departure" & trip2!=. & port_type!=port_type[_n-1]
by ship_id trip2: replace  check = 1 if move_type=="arrival"   & check[_n+1]==1
* No filename given: overwrites the file loaded above (tempdata), now including check.
save, replace
* Checked: no mistakes in the recording of anchorage.
	
	
** (5) Check that inTransit flags come in pairs, and adjust if not
*  (if a ship arrives at a port as in transit, it should also leave the port as in transit)
use tempdata,clear
cap drop check
gsort ship_id trip2 time move_type
* Flag a departure whose inTransit value differs from the preceding record of
* the same port stay, then copy the flag to the arrival just before it.
by ship_id trip2: gen check=1 if  inTransit!=inTransit[_n-1]  & trip2!=.& move_type=="departure"
by ship_id trip2: replace check=1 if check[_n+1]==1  & trip2!=. & move_type=="arrival"
  //38443 cases. Checked: in most cases draught changes little, but there are also cases where it changes a lot.
  // --> misreporting seems to go both ways
  //If draught did not change, treat as inTransit
  //If draught changed or is missing, treat as not inTransit
  //Alternatively, all could be treated as not inTransit. This would not affect box accounting; only summary stats on # of travels would differ a bit

	gsort ship_id  trip2 time  move_type
	* Relative draught change between a flagged record and the next record of the stay.
	* It is missing when new_draught is missing or zero, or when there is no next record.
	by ship_id trip2: gen check2=(new_draught-new_draught[_n+1])/new_draught if check==1
	* The flagged departure has no next record in its stay: reuse the arrival's value.
	by ship_id trip2: replace check2=check2[_n-1] if check2==. &trip2!=.	&  check==1
	* Use the full variable name: the abbreviation inT fails under
	* -set varabbrev off- or as soon as another variable starting with inT exists.
	replace inTransit="true" if check==1 &check2==0
	* A missing check2 satisfies check2!=0, so missing draught -> "false".
	replace inTransit="false" if check==1 & check2!=0

drop check* trip2
* No filename given: overwrites the file loaded above (tempdata).
save,replace

** (6) Calculate travel hours again
use tempdata,clear
gsort ship_id time move_type
* travel_hs = hours between the two records of a voyage (same trip id).
* -bys trip:- alone re-sorts on trip only and leaves the order WITHIN a trip
* arbitrary, so date-date[_n-1] could flip sign from run to run. Ordering each
* trip by date makes the result deterministic and non-negative.
* Records without a trip id are excluded; otherwise they would form one
* artificial group mixing unrelated ships.
bys trip (date): gen travel_hs=hours(date-date[_n-1]) if !missing(trip)
* The first record of a trip has no predecessor: copy the value from the next record.
bys trip (date): replace travel_hs=travel_hs[_n+1] if travel_hs==. & !missing(trip)

cap drop first 
* No filename given: overwrites the file loaded above (tempdata).
save,replace 


save ais_container_cleaned,replace  // this is the container data including anchorage, intransit, but adjusted for other irregularities 


save "$dir\Repository\data_intermediate\ais_container_cleaned",replace 

* Remove intermediate files (tempbackup, tempdata). Uses the Windows shell;
* NOTE(review): the wildcards delete every matching .dta in the working directory.
!del *tmp*.dta
!del *temp*.dta
